axis.title = element_text(size = 40))
# Save figure 4a. No `plot` argument is given, so ggsave() writes the most
# recently created ggplot (the one built immediately above this call).
ggsave(paste0(path_output, 'fig4a.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')

# figure 4b: density histogram of followers_count_one on a log10 x-axis, with
# the groups in Type overlaid (position = 'identity') and semi-transparent.
# NOTE(review): unlike the other panels, this one sets no `labels` and stops
# its breaks at 10000 -- confirm whether that difference is intended.
ggplot(df_user, aes(x = followers_count_one)) +
  theme_bw() +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(aes(fill = Type, y = after_stat(density)),
                 position = 'identity',
                 binwidth = 0.1, # bin width is in log10 units because of the scale
                 alpha = 0.7) +
  scale_fill_manual(values = c('red', 'skyblue')) +
  # NOTE(review): the 0 break has no finite position on a log10 axis and
  # presumably never renders -- confirm before removing it.
  scale_x_continuous(trans = 'log10',
                     breaks = c(0, 1, 10, 100, 1000, 10000)) +
  labs(x = '\nNumber of Followers',
       y = 'Density\n') +
  theme(legend.text = element_text(size = 40),
        legend.title = element_text(size = 40),
        legend.position = c(0.8, 0.8),
        axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
        axis.text.y = element_text(size = 40),
        axis.title = element_text(size = 40))
ggsave(paste0(path_output, 'fig4b.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')

# figure 4c: same layout for favourites_count_one; legend moved to the left.
ggplot(df_user, aes(x = favourites_count_one)) +
  theme_bw() +
  geom_histogram(aes(fill = Type, y = after_stat(density)),
                 position = 'identity',
                 binwidth = 0.1,
                 alpha = 0.7) +
  scale_fill_manual(values = c('red', 'skyblue')) +
  scale_x_continuous(trans = 'log10',
                     labels = label_number(accuracy = 1),
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000)) +
  labs(x = '\nNumber of Likes',
       y = 'Density\n') +
  theme(legend.text = element_text(size = 40),
        legend.title = element_text(size = 40),
        legend.position = c(0.2, 0.8),
        axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
        axis.text.y = element_text(size = 40),
        axis.title = element_text(size = 40))
ggsave(paste0(path_output, 'fig4c.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')

# figure 4d: same layout for statuses_count_one.
ggplot(df_user, aes(x = statuses_count_one)) +
  theme_bw() +
  geom_histogram(aes(fill = Type, y = after_stat(density)),
                 position = 'identity',
                 binwidth = 0.1,
                 alpha = 0.7) +
  scale_fill_manual(values = c('red', 'skyblue')) +
  scale_x_continuous(trans = 'log10',
                     labels = label_number(accuracy = 1),
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000)) +
  labs(x = '\nNumber of Tweets', y = 'Density\n') +
  theme(legend.text = element_text(size = 40),
        legend.title = element_text(size = 40),
        legend.position = c(0.2, 0.8),
        axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
        axis.text.y = element_text(size = 40),
        axis.title = element_text(size = 40))
ggsave(paste0(path_output, 'fig4d.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')
# table a10 -------------------------
# Median friends / followers / likes / tweets counts, one row per user type
# (rows where violent == 1 first, then rows where violent == 0).
user_median <- local({
  violent_rows <- which(df_user$violent == 1)
  nonviolent_rows <- which(df_user$violent == 0)
  # Medians of one count column for the two groups, violent first.
  group_medians <- function(counts) {
    c(median(counts[violent_rows]), median(counts[nonviolent_rows]))
  }
  data.frame(
    Friends = group_medians(df_user$friends_count),
    Followers = group_medians(df_user$followers_count),
    Likes = group_medians(df_user$favourites_count),
    Tweets = group_medians(df_user$statuses_count)
  )
})
row.names(user_median) <- c('Violent', 'Non-violent')
write.csv(user_median, paste0(path_output, 'tbla10.csv'))
# Close every open connection, which also ends any active sink to a log file.
closeAllConnections() # Close connection to log file
# Connection for the log of the script that follows.
my_log <- file('kim_psrm_fig2_fig4_tbla6_tbla10_log.txt') # File name of output log
# Divert both regular output and messages (warnings/errors) to the log.
# The output sink must come first: it opens the connection that the message
# sink then reuses.
sink(my_log, append = TRUE, type = "output")
sink(my_log, append = TRUE, type = "message")
# Write the full text of the script currently open in the RStudio source editor
# into the log. NOTE(review): this relies on rstudioapi, so it presumably only
# works when the script is run from inside RStudio -- confirm.
cat(readChar(rstudioapi::getSourceEditorContext()$path, file.info(rstudioapi::getSourceEditorContext()$path)$size))
###############################################################################
#### Replication Materials                                                 ####
#### Taegyoon Kim, 2022. Violent Political Rhetoric on Twitter.            ####
#### Political Science Research and Methods                                ####
###############################################################################
###############################################################################
################################### Set Up ####################################
###############################################################################
# packages -------------------------
lapply(
c('tidyr', 'dplyr', 'ggplot2', 'ggrepel', 'knitr', 'gridExtra', 'ggthemes',
'readr', 'quanteda', 'tm','wordtools', 'xtable', 'scales', 'magrittr'),
require,
character.only = TRUE
)
# load Fightin' Words function -------------------------
# Note: Define fw functions first. This is based on <Fightin' Words: Lexical
# Feature Selection and Evaluation for Identifying the Content of Political Conflict>.
# Relevant original codes can be found here:
# https://burtmonroe.github.io/TextAsDataCourse/Tutorials/TADA-FightinWords.nb.html
# Compute "Fightin' Words" statistics comparing term usage across groups.
#
# Arguments:
#   dtm      document-term count matrix (documents in rows, terms in columns).
#   groups   group label for each document; a factor or anything as.factor()
#            accepts (e.g. a character vector).
#   pair     optional pair of group names; when given, the comparison is
#            restricted to those two groups and results are returned for the
#            first of them.
#   weights  per-document weights; NA weights are treated as 0 and weights are
#            rescaled to have mean 1.
#   k.prior  multiplier applied to the expected counts that are added to the
#            observed counts as a prior.
#
# Returns a list with elements zeta, delta, se, counts and acounts. Without
# `pair` these are group-by-term matrices; with `pair`, zeta/delta/se are
# vectors for the first group of the pair.
fwgroups <- function(dtm,
                     groups,
                     pair = NULL,
                     weights = rep(1, nrow(dtm)),
                     k.prior = .1) {
  weights[is.na(weights)] <- 0
  weights <- weights / mean(weights)

  # Drop documents with no terms or zero weight, then terms that no remaining
  # document uses.
  zero.doc <- rowSums(dtm) == 0 | weights == 0
  zero.term <- colSums(dtm[!zero.doc, ]) == 0
  dtm.nz <- apply(dtm[!zero.doc, !zero.term], 2, '*', weights[!zero.doc])

  # Prior: expected count of each cell under independence of rows and columns.
  g.prior <- tcrossprod(rowSums(dtm.nz), colSums(dtm.nz)) / sum(dtm.nz)
  g.posterior <- as.matrix(dtm.nz + k.prior * g.prior)

  # Coerce to factor first: droplevels() has no method for character vectors,
  # so non-factor labels would otherwise raise an error here.
  groups <- droplevels(as.factor(groups)[!zero.doc])

  # Group-level (prior-augmented) counts, one row per group level.
  g.adtm <- as.matrix(aggregate(x = g.posterior, by = list(groups = groups), FUN = sum)[, -1])
  rownames(g.adtm) <- levels(groups)
  g.ladtm <- log(g.adtm)

  # Double-centre the log counts (over groups, then over terms).
  g.delta <- t(scale(t(scale(g.ladtm, center = TRUE, scale = FALSE)), center = TRUE, scale = FALSE))
  g.adtm_w <- -sweep(g.adtm, 1, rowSums(g.adtm)) # terms not w spoken by k
  g.adtm_k <- -sweep(g.adtm, 2, colSums(g.adtm)) # w spoken by groups other than k
  g.adtm_kw <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k
  g.se <- sqrt(1 / g.adtm + 1 / g.adtm_w + 1 / g.adtm_k + 1 / g.adtm_kw)
  g.zeta <- g.delta / g.se
  g.counts <- as.matrix(aggregate(x = dtm.nz, by = list(groups = groups), FUN = sum)[, -1])

  if (!is.null(pair)) {
    # Same computation restricted to the two groups named in `pair`.
    pr.delta <- t(scale(t(scale(g.ladtm[pair, ], center = TRUE, scale = FALSE)), center = TRUE, scale = FALSE))
    pr.adtm_w <- -sweep(g.adtm[pair, ], 1, rowSums(g.adtm[pair, ]))
    pr.adtm_k <- -sweep(g.adtm[pair, ], 2, colSums(g.adtm[pair, ])) # w spoken by groups other than k
    pr.adtm_kw <- sum(g.adtm[pair, ]) - pr.adtm_w - pr.adtm_k - g.adtm[pair, ] # total terms not w or k
    pr.se <- sqrt(1 / g.adtm[pair, ] + 1 / pr.adtm_w + 1 / pr.adtm_k + 1 / pr.adtm_kw)
    pr.zeta <- pr.delta / pr.se
    # NOTE(review): counts/acounts here are summed over ALL groups, not just
    # the pair -- confirm that is intended.
    return(list(zeta = pr.zeta[1, ],
                delta = pr.delta[1, ],
                se = pr.se[1, ],
                counts = colSums(dtm.nz),
                acounts = colSums(g.adtm)))
  }

  list(zeta = g.zeta,
       delta = g.delta,
       se = g.se,
       counts = g.counts,
       acounts = g.adtm)
}
# Convert one or more colours to hex strings that carry an alpha channel.
#
# Arguments:
#   someColor  colour specification(s) accepted by col2rgb().
#   alpha      alpha value on the 0-255 scale used for every colour.
#
# Returns a character vector with one "#RRGGBBAA" string per input colour.
makeTransparent <- function(someColor, alpha = 100) {
  # Build the hex string for a single colour from its red/green/blue values.
  to_hex <- function(channels) {
    rgb(red = channels[1],
        green = channels[2],
        blue = channels[3],
        alpha = alpha,
        maxColorValue = 255)
  }
  # col2rgb() yields one column per colour, so convert column by column.
  rgb_matrix <- col2rgb(someColor)
  apply(rgb_matrix, 2, to_hex)
}
# Plot the top-ranked terms of each group from a Fightin' Words result.
#
# Arguments:
#   fw.ch          list with elements `zeta` and `counts` (as returned by
#                  fwgroups()).
#   groups.use     groups to plot; names must match rownames(fw.ch$zeta). For a
#                  two-group result whose zeta is a plain vector, the two names
#                  to use as facet labels.
#   max.words      keep terms whose zeta rank within a group is at most this.
#   max.countrank  keep terms whose overall count rank is at most this.
#   colorpalette   one colour per group, in the order of groups.use.
#   sizescale      scaling factor for point and label sizes.
#   title, subtitle, caption  plot annotations passed to labs().
#
# Returns the ggplot object invisibly (one facet row per group). The x position
# grows with overall term frequency; the y position grows with how strongly the
# term is associated with the group.
fw.ggplot.groups <- function(fw.ch,
                             groups.use = as.factor(rownames(fw.ch$zeta)),
                             max.words = 50,
                             max.countrank = 400,
                             colorpalette = rep('black', length(groups.use)),
                             sizescale = 2,
                             title = '',
                             subtitle = '',
                             caption = '') {
  # Work with labels, not factor codes: indexing a matrix by a factor uses the
  # integer codes, which selects the wrong rows when groups.use is a subset.
  group.labels <- as.character(groups.use)

  if (is.null(dim(fw.ch$zeta))) { ## two-group fw object consists of vectors, not matrices
    zetarankmat <- cbind(rank(-fw.ch$zeta), rank(fw.ch$zeta))
    colnames(zetarankmat) <- group.labels
    countrank <- rank(-(fw.ch$counts))
  } else {
    # drop = FALSE keeps a matrix even when a single group is requested.
    zetarankmat <- apply(-fw.ch$zeta[group.labels, , drop = FALSE], 1, rank)
    countrank <- rank(-colSums(fw.ch$counts))
  }

  # One row per (term, group) with the term's zeta rank in that group.
  wideplotmat <- as_tibble(cbind(zetarankmat, countrank = countrank))
  wideplotmat$term <- names(countrank)
  rankplot <- gather(wideplotmat, groups.use, zetarank, 1:ncol(zetarankmat))

  # Higher-ranked terms get larger labels.
  rankplot$plotsize <- sizescale * (50 / (rankplot$zetarank)) ^ (1 / 4)
  rankplot <- rankplot[rankplot$zetarank < max.words + 1 & rankplot$countrank < max.countrank + 1, ]
  rankplot$groups.use <- factor(rankplot$groups.use, levels = group.labels)

  p <- ggplot(rankplot, aes((nrow(rankplot) - countrank) ^ 1, -(zetarank ^ 1), colour = groups.use)) +
    geom_point(show.legend = FALSE, size = sizescale / 2) +
    theme_classic() +
    ylim(-max.words, 40) +
    facet_grid(groups.use ~ .) +
    geom_text_repel(aes(label = term), max.overlaps = Inf, size = rankplot$plotsize, point.padding = .05,
                    box.padding = unit(0.20, 'lines'), show.legend = FALSE) +
    scale_colour_manual(values = alpha(colorpalette, .9)) +
    labs(x = paste('Overall Frequency'), y = paste('Type-specific Frequency'),
         title = title, subtitle = subtitle, caption = caption) +
    theme(axis.ticks = element_blank(),
          axis.text = element_blank(),
          plot.title = element_text(hjust = 0.5),
          text = element_text(size = 17.5),
          strip.text.x = element_text(size = 17.5))

  # Invisible, as before (the original ended on the assignment to `p`).
  invisible(p)
}
# List the top terms of each group by zeta score.
#
# Arguments:
#   fw.ch   list whose `zeta` element is a group-by-term matrix with term names
#           as column names (as returned by fwgroups() without `pair`).
#   n.keys  number of terms to report per group.
#
# Returns a character matrix with n.keys rows and one column per group; each
# column holds that group's terms in decreasing order of zeta.
fw.keys <- function(fw.ch, n.keys = 10) {
  zeta <- fw.ch$zeta
  n.groups <- nrow(zeta)
  keys <- matrix('', n.keys, n.groups)
  colnames(keys) <- rownames(zeta)
  # seq_len() instead of 1:n so that a zero-length range iterates zero times.
  for (g in seq_len(n.groups)) {
    keys[, g] <- names(sort(zeta[g, ], decreasing = TRUE)[seq_len(n.keys)])
  }
  keys
}
###############################################################################
############ Generate Figure 2 & Figure 4 & Table A6  & Table A10 #############
###############################################################################
# load data -------------------------
# Input and output directories; both end with a slash because file names are
# appended with paste0().
path_data <- '/Users/taegyoon/Google Drive/kim_psrm_replication/data/'
path_output <- '/Users/taegyoon/Google Drive/kim_psrm_replication/output/'

# Tweet-level data without the tweet text; the column X1 is skipped on read.
df_fw_notext <- read_csv(
  file = paste0(path_data, 'df_fw_notext.csv'),
  col_types = cols(X1 = col_skip())
)

# Saved document-term matrix.
df_fw_dfm <- readRDS(paste0(path_data, 'df_fw_dfm.rds'))

# User-level data; the column X1 is skipped on read.
df_user <- read_csv(
  file = paste0(path_data, 'df_user.csv'),
  col_types = cols(X1 = col_skip())
)
# text pre-processing -------------------------
# Note: Because Twitter restricts sharing of raw tweets, I am only able to share
# an RDS for the document-term matrix (i.e., df_fw_dfm.rds). The three blocks of
# code lines below were used to transform raw tweet text to a document-term
# matrix. The data frame, df_fw_notext, only contains the type of tweets
# (violent vs. non-violent) without the text, and presumably corresponds row by
# row to the documents in df_fw_dfm (it supplies their group labels below).
# df_fw$text_processed <- iconv(df_fw$text, # remove ampersands and emojis
#                             'latin1',
#                             'ASCII',
#                             sub = '')
# df_fw$text_processed <- removeWords(df_fw$text_processed, 'amp')
#  text_tokens <- tokens( # tokenization
#    df_fw$text_processed,
#    remove_punct = TRUE,
#    remove_symbols = TRUE,
#    remove_numbers = TRUE,
#    remove_url = TRUE)
#  df_dfm <- dfm( # document-term matrix
#   text_tokens,
#   tolower = TRUE,
#   stem = TRUE,
#   remove = stopwords(),
#   case_insensitive = TRUE)
# figure 2 -------------------------
# Fightin' Words statistics by tweet type, then the term plot.
fw <- fwgroups(dtm = df_fw_dfm, groups = as.factor(df_fw_notext$type))
fw_plot <- fw.ggplot.groups(
  fw.ch = fw,
  max.words = 90,
  max.countrank = 400,
  colorpalette = c('dodgerblue3', 'red'),
  sizescale = 4
)
fw_plot
# No `plot` argument: ggsave() writes the most recent ggplot.
ggsave(
  filename = paste0(path_output, 'fig2.pdf'),
  width = 12,
  height = 10,
  units = 'in',
  dpi = 600
)
# table a6 -------------------------
# Top 30 terms per group: printed as a LaTeX table and written to CSV.
fw_keys <- fw.keys(fw.ch = fw, n.keys = 30)
xtable(fw_keys)
write.csv(x = fw_keys, file = paste0(path_output, 'tbla6.csv'))
# figure 4 -------------------------
# Add 1 to every count so that zero counts have a finite position on the
# log10 axes used below.
df_user$friends_count_one <- df_user$friends_count + 1
df_user$followers_count_one <- df_user$followers_count + 1
df_user$favourites_count_one <- df_user$favourites_count + 1
df_user$statuses_count_one <- df_user$statuses_count + 1

# User type as a factor with 'Violent' as the first level, so the manual fill
# scale below maps 'Violent' to red and 'Non-violent' to skyblue.
df_user$Type <- factor(ifelse(df_user$violent == 1, 'Violent', 'Non-violent'),
                       levels = c('Violent', 'Non-violent'))

# Each panel is a density histogram on a log10 x-axis with the two user types
# overlaid (position = 'identity') and semi-transparent. after_stat(density)
# replaces the deprecated `..density..` notation. Every ggsave() call has no
# `plot` argument and therefore writes the plot built just before it.
# NOTE(review): the 0 break has no finite position on a log10 axis and
# presumably never renders -- confirm before removing it.

# figure 4a
ggplot(df_user, aes(x = friends_count_one)) +
  theme_bw() +
  geom_histogram(aes(fill = Type, y = after_stat(density)),
                 position = 'identity',
                 binwidth = 0.1, # bin width is in log10 units because of the scale
                 alpha = 0.7) +
  scale_fill_manual(values = c('red', 'skyblue')) +
  scale_x_continuous(trans = 'log10',
                     labels = label_number(accuracy = 1),
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000)) +
  labs(x = '\nNumber of Friends',
       y = 'Density\n') +
  theme(legend.text = element_text(size = 40),
        legend.title = element_text(size = 40),
        legend.position = c(0.8, 0.8),
        axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
        axis.text.y = element_text(size = 40),
        axis.title = element_text(size = 40))
ggsave(paste0(path_output, 'fig4a.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')

# figure 4b
# NOTE(review): unlike the other panels, this one sets no `labels` and stops
# its breaks at 10000 -- confirm whether that difference is intended.
ggplot(df_user, aes(x = followers_count_one)) +
  theme_bw() +
  geom_histogram(aes(fill = Type, y = after_stat(density)),
                 position = 'identity',
                 binwidth = 0.1,
                 alpha = 0.7) +
  scale_fill_manual(values = c('red', 'skyblue')) +
  scale_x_continuous(trans = 'log10',
                     breaks = c(0, 1, 10, 100, 1000, 10000)) +
  labs(x = '\nNumber of Followers',
       y = 'Density\n') +
  theme(legend.text = element_text(size = 40),
        legend.title = element_text(size = 40),
        legend.position = c(0.8, 0.8),
        axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
        axis.text.y = element_text(size = 40),
        axis.title = element_text(size = 40))
ggsave(paste0(path_output, 'fig4b.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')

# figure 4c (legend moved to the left)
ggplot(df_user, aes(x = favourites_count_one)) +
  theme_bw() +
  geom_histogram(aes(fill = Type, y = after_stat(density)),
                 position = 'identity',
                 binwidth = 0.1,
                 alpha = 0.7) +
  scale_fill_manual(values = c('red', 'skyblue')) +
  scale_x_continuous(trans = 'log10',
                     labels = label_number(accuracy = 1),
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000)) +
  labs(x = '\nNumber of Likes',
       y = 'Density\n') +
  theme(legend.text = element_text(size = 40),
        legend.title = element_text(size = 40),
        legend.position = c(0.2, 0.8),
        axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
        axis.text.y = element_text(size = 40),
        axis.title = element_text(size = 40))
ggsave(paste0(path_output, 'fig4c.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')

# figure 4d
ggplot(df_user, aes(x = statuses_count_one)) +
  theme_bw() +
  geom_histogram(aes(fill = Type, y = after_stat(density)),
                 position = 'identity',
                 binwidth = 0.1,
                 alpha = 0.7) +
  scale_fill_manual(values = c('red', 'skyblue')) +
  scale_x_continuous(trans = 'log10',
                     labels = label_number(accuracy = 1),
                     breaks = c(0, 1, 10, 100, 1000, 10000, 100000)) +
  labs(x = '\nNumber of Tweets', y = 'Density\n') +
  theme(legend.text = element_text(size = 40),
        legend.title = element_text(size = 40),
        legend.position = c(0.2, 0.8),
        axis.text.x = element_text(size = 40, angle = 0, vjust = 0, hjust = 0.5),
        axis.text.y = element_text(size = 40),
        axis.title = element_text(size = 40))
ggsave(paste0(path_output, 'fig4d.pdf'),
       dpi = 600,
       width = 20,
       height = 10,
       units = 'in')
# table a10 -------------------------
# Median friends / followers / likes / tweets counts, one row per user type
# (rows where violent == 1 first, then rows where violent == 0).
user_median <- local({
  violent_rows <- which(df_user$violent == 1)
  nonviolent_rows <- which(df_user$violent == 0)
  # Medians of one count column for the two groups, violent first.
  group_medians <- function(counts) {
    c(median(counts[violent_rows]), median(counts[nonviolent_rows]))
  }
  data.frame(
    Friends = group_medians(df_user$friends_count),
    Followers = group_medians(df_user$followers_count),
    Likes = group_medians(df_user$favourites_count),
    Tweets = group_medians(df_user$statuses_count)
  )
})
row.names(user_median) <- c('Violent', 'Non-violent')
write.csv(user_median, paste0(path_output, 'tbla10.csv'))
# Close every open connection, which also ends any active sink to a log file.
closeAllConnections() # Close connection to log file
# Connection for the log of the script that follows.
my_log <- file('kim_psrm_figa1_tbla1_log.txt') # File name of output log
# Divert both regular output and messages (warnings/errors) to the log.
# The output sink must come first: it opens the connection that the message
# sink then reuses.
sink(my_log, append = TRUE, type = "output")
sink(my_log, append = TRUE, type = "message")
# Write the full text of the script currently open in the RStudio source editor
# into the log. NOTE(review): this relies on rstudioapi, so it presumably only
# works when the script is run from inside RStudio -- confirm.
cat(readChar(rstudioapi::getSourceEditorContext()$path, file.info(rstudioapi::getSourceEditorContext()$path)$size))
###############################################################################
#### Replication Materials                                                 ####
#### Taegyoon Kim, 2022. Violent Political Rhetoric on Twitter.            ####
#### Political Science Research and Methods                                ####
###############################################################################
###############################################################################
################################### Set Up ####################################
###############################################################################
# packages -------------------------
lapply(c('readr', 'gridExtra', 'xtable', 'hexbin', 'ggplot2', 'dplyr',
'ggthemes', 'dplyr', 'tidyr', 'magrittr', 'scales'),
require,
character.only = TRUE)
###############################################################################
######################## Generate Figure A1 & Table A1 ########################
###############################################################################
# load count and handle data -------------------------
# Input and output directories; both end with a slash because file names are
# appended with paste0().
path_data <- '/Users/taegyoon/Google Drive/kim_psrm_replication/data/'
path_output <- '/Users/taegyoon/Google Drive/kim_psrm_replication/output/'

# Tweet counts by handle and by name, plus account attributes.
df_prop <- read_csv(file = paste0(path_data, 'df_prop.csv'))
# figure a1 -------------------------
# Aggregate the proportion data at the politician level: one row per handle,
# keeping the first value of each attribute and summing the two tweet counts.
df_prop_agg <- summarise(
  group_by(df_prop, handle),
  party_bi = first(party_bi),
  gender = first(gender),
  handle = first(handle),
  office = first(office),
  name = first(name),
  tweet_count_handle_sum = sum(tweet_count_handle),
  tweet_count_name_sum = sum(tweet_count_name)
)

# Ratio of name-tweet counts to handle-tweet counts. The 0.001 added to both
# terms deals with zeros (no division by zero, and no zero on the log axis).
df_prop_agg$prop <- with(
  df_prop_agg,
  (tweet_count_name_sum + 0.001) / (tweet_count_handle_sum + 0.001)
)

# Histogram of the ratio on a log axis labelled in percent, with a dashed red
# line at the median and a text annotation next to it.
ggplot(df_prop_agg, aes(x = prop)) +
  geom_histogram(bins = 15, fill = 'white', color = 'black') +
  geom_vline(aes(xintercept = median(prop)),
             linetype = 'longdash',
             colour = 'red') +
  ggplot2::annotate('text',
                    label = 'Median: 13.04%',
                    x = 0.55,
                    y = 130,
                    size = 6,
                    colour = 'red') +
  scale_x_continuous(trans = log_trans(),
                     n.breaks = 10,
                     labels = number_format(accuracy = 0.01, scale = 100)) +
  labs(x = '\nProportion (number of full name tweets / number of mention tweets)',
       y = 'Number of Accounts') +
  theme_classic() +
  theme(text = element_text(size = 17))

# No `plot` argument: ggsave() writes the plot built just above.
ggsave(filename = paste0(path_output, 'figa1.pdf'),
       width = 10,
       height = 7,
       units = 'in',
       dpi = 600)
# table a1 -------------------------
# Median proportion (in percent, rounded to one decimal) for each subgroup of
# accounts and for all accounts together.
prop_compare <- local({
  # Median of prop, times 100, over the rows where `column` equals `value`.
  median_pct <- function(column, value) {
    rows <- which(df_prop_agg[[column]] == value)
    median(df_prop_agg$prop[rows]) * 100
  }
  data.frame(
    Type = c('Women', 'Men', 'Republican', 'Non-Republican', 'Governors', 'Senators', 'Representatives', 'Total'),
    Proportion = c(
      median_pct('gender', 'F'),
      median_pct('gender', 'M'),
      median_pct('party_bi', 'R'),
      median_pct('party_bi', 'Non-R'),
      median_pct('office', 'governor'),
      median_pct('office', 'senator'),
      median_pct('office', 'representative'),
      median(df_prop_agg$prop) * 100
    )
  )
})
prop_compare$Proportion <- round(prop_compare$Proportion, 1)
write.csv(prop_compare, paste0(path_output, 'tbla1.csv'))
# Close every open connection, which also ends any active sink to a log file.
closeAllConnections() # Close connection to log file
# Connection for the log of the script that follows. A duplicate of this line
# written with typographic quotes (which R cannot parse) was removed.
# NOTE(review): unlike the other logs in this file, this name has no
# '_log.txt' suffix -- confirm whether that is intended.
my_log <- file('kim_psrm_tbla3') # File name of output log
# Divert both regular output and messages (warnings/errors) to the log.
# The output sink must come first: it opens the connection that the message
# sink then reuses.
sink(my_log, append = TRUE, type = "output")
sink(my_log, append = TRUE, type = "message")
# Write the full text of the script currently open in the RStudio source editor
# into the log. NOTE(review): this relies on rstudioapi, so it presumably only
# works when the script is run from inside RStudio -- confirm.
cat(readChar(rstudioapi::getSourceEditorContext()$path, file.info(rstudioapi::getSourceEditorContext()$path)$size))
###############################################################################
#### Replication Materials                                                 ####
#### Taegyoon Kim, 2022. Violent Political Rhetoric on Twitter.            ####
#### Political Science Research and Methods                                ####
###############################################################################
###############################################################################
################################### Set Up ####################################
###############################################################################
# packages -------------------------
lapply(
c('psy', 'irr', 'readxl', 'RCurl', 'tidyverse',
'WriteXLS','DescTools','irrCAC','dplyr'),
require,
character.only = TRUE
)
# load data -------------------------
# Input and output directories; both end with a slash because file names are
# appended with paste0().
path_data <- '/Users/taegyoon/Google Drive/kim_psrm_replication/data/'
path_output <- '/Users/taegyoon/Google Drive/kim_psrm_replication/output/'

# Coder ratings; the column X1 is skipped on read.
df_icr <- read_csv(
  file = paste0(path_data, 'df_icr.csv'),
  col_types = cols(X1 = col_skip())
)
###############################################################################
################################## Table A3 ###################################
###############################################################################
# pairs of coders -------------------------
# Unweighted Cohen's kappa for each pair of the three coders. The trailing
# numbers are the values noted in the original script.
c_kappa_1 <- kappa2(
  df_icr[, c('coder_1', 'coder_2')],
  weight = 'unweighted'
) # 0.569
c_kappa_2 <- kappa2(
  df_icr[, c('coder_1', 'coder_3')],
  weight = 'unweighted'
) # 0.622
c_kappa_3 <- kappa2(
  df_icr[, c('coder_2', 'coder_3')],
  weight = 'unweighted'
) # 0.593

# all three coders -------------------------
# Agreement statistics computed on all three coders' ratings at once.
l_kappa <- kappam.light(df_icr[, c('coder_1', 'coder_2', 'coder_3')]) # 0.595
f_kappa <- kappam.fleiss(df_icr[, c('coder_1', 'coder_2', 'coder_3')]) # 0.597
k_alpha <- krippen.alpha.raw(df_icr[, c('coder_1', 'coder_2', 'coder_3')]) # 0.597

# Collect the six statistics in a one-row table and write it out.
icr <- data.frame(
  cohen_kappa_coder_1_2 = c_kappa_1$value,
  cohen_kappa_coder_1_3 = c_kappa_2$value,
  cohen_kappa_coder_2_3 = c_kappa_3$value,
  light_kappa = l_kappa$value,
  fless_kappa = f_kappa$value,
  # fourth column of the `est` table returned by krippen.alpha.raw()
  krippendorff_alpha = k_alpha$est[4][,1]
)
row.names(icr) <- 'score'
write.csv(x = icr, file = paste0(path_output, 'tbla3.csv'))

closeAllConnections() # Close connection to log file
